home *** CD-ROM | disk | FTP | other *** search
Text File | 1997-09-11 | 10.3 KB | 324 lines | [TEXT/CWIE] |
- // TermIndex.h
- // Copyright: © 1994 - 1996 by Apple Computer, Inc., all rights reserved.
-
- //// TermIndex: a base class for both vector and inverted indices.
- //// Maintains a dictionary of terms.
-
- #pragma once
- #ifndef TermIndex_h
- #define TermIndex_h
-
- #pragma import on
-
- #include "IAIndex.h"
-
- //#pragma IA_BEGIN_IMPORTS
- #include <time.h>
- //#pragma IA_END_IMPORTS
-
- #pragma IA_BEGIN_EXPORTS
-
- typedef uint32 TermID; // identifier assigned to a term (stored as TermInfo::id and IDTerm::id)
- typedef uint32 TermFreq; // count type; used below for per-term document counts and for term counts
- //typedef uint32 TermOffset; // (disabled in the original source)
-
- typedef TermID DocID; // document identifiers share TermID's underlying type
- typedef TermFreq DocLength; // document length, i.e. the number of terms in a document (see DocInfo::length)
-
-
- const uint32 TermIndexType = 'Ter2'; // four-character code used as the default for the `t` argument of the TermIndex constructor
-
- //// TermInfo: statistics about a term's occurrence.
-
- // Holds a term pointer, the term's ID and its document count. Subclasses augment, in particular to store postings, forming an inverted index.
- class TermInfo : public IAOrderedStorable {
- public:
- TermInfo(); // defined out of line; per the original note it sets term = NULL
- TermInfo(IATerm* t, TermID i) : term(t), id(i), docCount(0) {} // stores t and i as given; the document count starts at 0
- IA_INLINE ~TermInfo() IA_INLINE_DEF_BODY(delete term) // body supplied through the macro; presumably expands to { delete term; }, i.e. this object owns its term -- confirm against the macro definition
-
-
- // IAOrderedStorable methods (declared here, defined in the implementation file)
- IAStorable* DeepCopy() const;
- IABlockSize StoreSize() const;
- void Store(IAOutputBlock* output) const;
- IAStorable* Restore(IAInputBlock* input) const;
- bool LessThan(const IAOrderedStorable* other) const; // ordering is by term: return term->LessThanNonVirtual(other->term);
- bool Equal(const IAOrderedStorable* other) const; // equality is by term: return term->EqualNonVirtual(other->term);
- IATerm* GetTerm() const {return term;} // returns the stored pointer itself; no copy is made
- void SetTerm(IATerm* t) {term = t;} // overwrites the stored pointer; the previous term is not deleted here
- TermID GetTermID() const {return id;} // the ID passed to the two-argument constructor
- void SetDocumentCount(TermFreq dCount) {docCount = dCount;} // overwrites the document count
- TermFreq GetDocumentCount() const {return docCount;} // number of documents the term occurs in
-
- protected:
- void DeepCopying(const IAStorable* source); // presumably the copy hook behind DeepCopy() -- confirm in the implementation
- void Restoring(IAInputBlock* input, const IAStorable* proto); // presumably the read hook behind Restore() -- confirm in the implementation
- private:
- TermInfo(TermInfo&); // declared private and left undefined on purpose: don't define a copy constructor
-
- IATerm* term; // the term
- TermID id; // its ID
- TermFreq docCount; // the number of docs it occurs in
-
- };
-
- //// DocInfo: statistics about a document.
-
- // Holds a document pointer, the document's ID and its length in terms. Subclasses augment, in particular to store vectors.
- class DocInfo : public IAOrderedStorable {
- public:
- DocInfo() : doc(NULL), id(0), length(0) {} // zero every member so the getters never read indeterminate values on a default-constructed object
- DocInfo(IADoc* d, DocID i) : doc(d), id(i), length(0) {} // stores d and i as given; the length starts at 0
- IA_INLINE ~DocInfo() IA_INLINE_DEF_BODY(delete doc) // body supplied through the macro; presumably expands to { delete doc; }, i.e. this object owns its document -- confirm against the macro definition
-
-
- // IAOrderedStorable methods (declared here, defined in the implementation file)
- IAStorable* DeepCopy() const;
- IABlockSize StoreSize() const;
- void Store(IAOutputBlock* output) const;
- IAStorable* Restore(IAInputBlock* input) const;
- bool LessThan(const IAOrderedStorable* other) const; // ordering is by document: return doc->LessThan(other->doc);
- bool Equal(const IAOrderedStorable* other) const; // equality is by document: return doc->Equal(other->doc);
- IADoc* GetDocument() const {return doc;} // returns the stored pointer itself; no copy is made
- DocID GetDocID() const {return id;} // the ID passed to the two-argument constructor, or 0 after default construction
- void SetDocumentLength(DocLength dlength) {length = dlength;} // overwrites the stored length
- DocLength GetDocumentLength() const {return length;} // number of terms in the document
- void SetDocument(IADoc* d) {doc = d;} // overwrites the stored pointer; the previous document is not deleted here
- protected:
- void DeepCopying(const IAStorable* source); // presumably the copy hook behind DeepCopy() -- confirm in the implementation
- void Restoring(IAInputBlock* input, const IAStorable* proto); // presumably the read hook behind Restore() -- confirm in the implementation
- private:
- DocInfo(DocInfo&); // declared private and left undefined on purpose: don't define a copy constructor
-
- IADoc* doc; // the document
- DocID id; // its ID
- DocLength length; // the number of terms in the doc
-
- };
-
- // internal types needed for private member declarations; only pointers to them appear in this header
- struct FreqPosting; // parameter type of TermIndex::UpdateTermInfo()
- class TFVector; // parameter type of TermIndex::UpdateDocInfo()
- class TFMap; // output parameter type of TermIndex::GetTFMaps()
- class Progress; // progress object passed to GetTFMaps() and to the private flush/merge helpers
- class TermUpdateSet; // type of the queued term updates (TermIndex::termUpdateSet)
- class DocUpdateSet; // type of the queued document updates (TermIndex::docUpdateSet)
-
- // flush progress report support: signature of the callback installed with TermIndex::SetFlushProgressFn()
- typedef void FlushProgressFn(float percent, void* data); // `data` is presumably the pointer given to SetFlushProgressData() -- confirm in the implementation
-
- //// TermIndex: base class for vector & inverted indices; maintains the term and document dictionaries declared below
-
- class TermIndex : public IAIndex {
- friend class IAQuery; // for GetTFMaps()
- public:
- TermIndex(IAStorage* s, IACorpus* c, IAAnalysis* a,
- uint32 t = TermIndexType, IABlockID r = NULL); // t defaults to TermIndexType, r to NULL
- ~TermIndex();
-
- // IAIndex methods we'll modify
- void Initialize();
- void Open();
-
- void Flush();
-
- void AddDoc(IADoc* doc);
- void AddDoc(IADoc* doc, DocID* returnID); // overload with a DocID out-pointer; presumably receives the ID assigned to doc -- confirm
- void RenameDoc(const IADoc* oldName, const IADoc* newName);
- void DeleteDoc(const IADoc* doc);
-
- bool IsDocIndexed(const IADoc* doc);
-
- IADocIterator* GetDocIterator();
- IADocIterator* GetDocIterator(const IADoc* start); // overload taking a starting document
-
- void Merge(IAIndex** indices, uint32 indexCount); // indices is an array; indexCount is passed alongside it
-
- // TermIndex-specific methods
-
- virtual void Purge();
- // TermInfo's & ID's
- virtual TermInfo* GetTermInfo(IATerm* term);
-
- IATerm* GetIDTerm(TermID id); // maps a TermID back to its term
- TermID GetMaxTermID();
- TermFreq GetTermCount();
-
- IAOrderedStorableIterator* GetTermInfoIterator();
- IAOrderedStorableIterator* GetTermInfoIterator(IATerm* start); // overload taking a starting term
-
- // DocInfo's & ID's
- DocInfo* GetDocInfo(const IADoc* doc, bool noError = false); // noError defaults to false; presumably suppresses the error for an unknown doc -- confirm
-
- IADoc* GetIDDoc(DocID id); // maps a DocID back to its document
- DocID GetMaxDocID();
- DocID GetDocCount();
-
- IAOrderedStorableIterator* GetDocInfoIterator();
- IAOrderedStorableIterator* GetDocInfoIterator(const IADoc* start); // overload taking a starting document
-
- // Flush progress reporting. Called under AddDoc(), DeleteDoc() and Flush().
- void SetFlushProgressFn(FlushProgressFn* fn) {flushProgressFn = fn;} // installs the progress callback
- FlushProgressFn* GetFlushProgressFn() const {return flushProgressFn;}
- void SetFlushProgressData(void* pdata) {flushProgressData = pdata;} // opaque pointer stored next to the callback
- void* GetFlushProgressData() const {return flushProgressData;}
- void SetFlushProgressFreq(clock_t freq) {flushProgressFreq = freq;} // stored as a clock_t; presumably the interval between callbacks -- confirm
- clock_t GetFlushProgressFreq() const {return flushProgressFreq;}
-
-
-
-
- IADefineNarrowMethods(TermIndex, IAIndex); // support for IANarrow
-
- bool Validate(bool verbose = true, bool justDocs = true); // validate this index; both flags default to true
- virtual bool ValidateTermInfos(bool verbose);
- virtual bool ValidateDocInfos(bool verbose);
-
- virtual uint32 GetNorm();
- protected:
- void Initializing();
- // methods which add some data to the index root block
- IABlockSize RootSize();
- void StoreRoot(IAOutputBlock* output);
- void RestoreRoot(IAInputBlock* input);
- // To be augmented by subclasses:
- // . for inverted indices:
- virtual TermInfo* MakeTermInfo(IATerm* term, TermID id);
- virtual TermInfo* UpdateTermInfo(TermInfo* i, FreqPosting* adds, TermFreq addCount, TermFreq delCount);
- virtual TermInfo* MergeTermInfo(TermInfo* i, TermInfo* addTi, TermIndex* addIndex, DocID base);
- virtual void MergeDocIDs(TermIndex* addIndex, DocID base);
- virtual uint32 MergeTermCost(); // used for merge progress estimations
- virtual uint32 MergeDocCost(); // used for merge progress estimations
- // . vector indices
- virtual DocInfo* MakeDocInfo(IADoc* doc, DocID docID);
- virtual void UpdateDocInfo(DocInfo* i, TFVector* vector);
- virtual void MergeDocInfo(DocInfo* i, DocInfo* addDi, TermIndex* addIndex, TermID* reMap);
-
- // update support
- virtual void FinishingUpdate();
- virtual void DeletingDoc(DocInfo* docInfo);
- virtual void DeletingPostings(IATerm* term, TermFreq delCount);
- virtual void FlushUpdates();
- // delete doc clean up
- virtual void ResetPostings(TermInfo* terminfo);
-
- // Used by RankedQuery. Default impl here retokenizes, optimized by VectorIndex.
- virtual bool GetTFMaps(IADoc** docs, uint32 nDocs, TFMap** tfMaps, Progress* p);
-
- // default constructor etc. so that this can be a virtual base class
- TermIndex();
- void Constructing(IAStorage* s, IACorpus* c, IAAnalysis* a, uint32 t, IABlockID r); // same arguments as the public constructor, without defaults
- IAOrderedStorableSet* GetTermInfoSet() const {return termInfoSet;} // direct access to the term dictionary set
- IAOrderedStorableSet* GetiDTermMap() const {return iDTermMap;} // direct access to the TermID-to-term map
- void SetUpdateCount(uint32 cnt) {updateCount = cnt;}
- uint32 GetUpdateCount() const {return updateCount;}
-
- void SetBytesForUpdate(uint32 bupdate) {bytesForUpdate = bupdate;} // amount of memory available
- uint32 GetBytesForUpdate() const {return bytesForUpdate;}
-
- private:
- void MaybeFlushUpdates();
- void FlushTermUpdates(Progress* prog);
- void FlushDocUpdates(Progress* prog);
-
- void MergeTermInfos(IAIndex** indices, DocID* bases, Progress* prog);
- void MergeDocInfos(IAIndex** indices, DocID* bases, Progress* prog);
-
- // The substance of the index.
- IAOrderedStorableSet* docInfoSet;
- IAOrderedStorableSet* iDDocMap;
- IAOrderedStorableSet* termInfoSet;
- IAOrderedStorableSet* iDTermMap;
- IAOrderedStorableSet** mergeMaps; // NULL except while merging
-
- uint32 updateCount;
-
- // queued updates.
- TermUpdateSet* termUpdateSet;
- DocUpdateSet* docUpdateSet;
-
- // persistent slots -- written in root
- IABlockID docInfoSetRoot;
- IABlockID iDDocMapRoot;
- IABlockID termInfoSetRoot;
- IABlockID iDTermMapRoot;
- DocID maxDocID;
- TermID maxTermID;
- uint32 mergeMapCount; // a count, not a pointer: presumably 0 except while merging -- confirm
- uint32* mergeMapRoots; // NULL except while merging
-
- FlushProgressFn* flushProgressFn; // see SetFlushProgressFn()
- void* flushProgressData; // see SetFlushProgressData()
- clock_t flushProgressFreq; // see SetFlushProgressFreq()
-
- uint32 bytesForUpdate; // amount of memory available
-
-
- TermIndex(TermIndex&); // declared private and left undefined on purpose: don't define a copy constructor
- };
-
-
- IAExceptionCode InvalidDocID = 'VIID'; // exception code for an invalid DocID. NOTE(review): this is a definition in a header; unless IAExceptionCode is const-qualified, every including translation unit defines the same external object -- confirm
- IAExceptionCode InvalidTermID = 'VIIT'; // exception code for an invalid TermID; the same header-definition caveat applies
-
-
- //// IDTerm: used in iDTermMap to map from TermIDs back to Terms -- ordered by decreasing id
- class IDTerm : public IAOrderedStorable {
- public:
- IDTerm(); // defined out of line; per the original note: term = NULL;
- IDTerm(TermID i, IATerm* t) : id(i), term(t) {} // stores the id/term pair as given
- ~IDTerm(); // defined out of line; per the original note: delete term;
-
- // IAOrderedStorable methods (declared here, defined in the implementation file)
- IAStorable* DeepCopy() const;
- IABlockSize StoreSize() const;
- void Store(IAOutputBlock* out) const;
- IAStorable* Restore(IAInputBlock* in) const;
- bool LessThan(const IAOrderedStorable* other) const; // larger ids sort first: return id > ((IDTerm*)other)->id;
- bool Equal(const IAOrderedStorable* other) const; // equality is by id only: return id == ((IDTerm*)other)->id;
- TermID GetTermID() const {return id;}
- IATerm* GetTerm() const {return term;} // returns the stored pointer itself; no copy is made
- void SetTerm(IATerm* t) {term = t;} // overwrites the stored pointer; the previous term is not deleted here
- void SetTermID(TermID ID){ id = ID;} // overwrites the id, which is also the ordering key
-
- private:
- IDTerm(IDTerm&); // declared private and left undefined on purpose: don't define a copy constructor
-
- TermID id; // the id
- IATerm* term; // and corresponding term
-
-
- };
-
-
- //// IDDoc: used in iDDocMap to map from DocIDs back to docs -- ordered by decreasing id
- class IDDoc : public IAOrderedStorable {
- public:
- IDDoc(); // defined out of line; per the original note: doc = NULL;
- IDDoc(DocID i, IADoc* d) : id(i), doc(d) {} // stores the id/document pair as given
- ~IDDoc(); // defined out of line; per the original note: delete doc;
-
-
- // IAOrderedStorable methods (declared here, defined in the implementation file)
- IAStorable* DeepCopy() const;
- IABlockSize StoreSize() const;
- void Store(IAOutputBlock* out) const;
- IAStorable* Restore(IAInputBlock* in) const;
- bool LessThan(const IAOrderedStorable* other) const; // larger ids sort first: return id > ((IDDoc*)other)->id;
- bool Equal(const IAOrderedStorable* other) const; // equality is by id only: return id == ((IDDoc*)other)->id;
- DocID GetDocID() const {return id;}
- IADoc* GetDocument() const {return doc;} // returns the stored pointer itself; no copy is made
- void SetDocument(IADoc* d) {doc = d;} // overwrites the stored pointer; the previous document is not deleted here
- private:
- IDDoc(IDDoc&); // declared private and left undefined on purpose: don't define a copy constructor
- DocID id; // the ID
- IADoc* doc; // the document
-
- };
-
- #pragma IA_END_EXPORTS
-
- #pragma import reset
- #endif
-